Discriminant Factors
# Setup: load clustering/plotting packages and the feature table.
library(fpc) # pamk
library(cluster) # pam
library(ape)
# Feature table of metabolic models; one row per model.
df=read.csv('../data/clustering_features.csv.gz')
# Row IDs of the form "<collection>_<model>" so cluster exports are traceable.
id.vector=paste(df$collection,df$model,sep='_')
rownames(df)=id.vector
# Numeric feature matrix: drop the two leading ID columns (collection, model).
df.num=subset(df,select=-c(1:2))
# One color per model collection; intended label order is given in the trailing comment.
colsCollection=c("#A6A9AA","#000000","#3E7CBC","#A3D2E2","#7E8082","#EDA85F","#CD2028") #labels=c('agora','bigg','ebrahim','embl','path','seed','uminho')
#############################################
### plot.rf.var.importance.by.class.heatmap
#############################################
# Plot heatmap with variable importance independent by predicted class.
# Args:
# model: random forest model already build
# predVar: string of column ID with predictor/variables names values
# classVar: string of class variable in 'df'
# title: header of the plot
plot.rf.var.importance.by.class.heatmap <- function(model, predVar, classVar, title) {
  # Per-class importance matrix, reshaped to long format for ggplot.
  imp.mat <- importance(model)
  long.imp <- melt(imp.mat[, 1:length(model$classes)])
  colnames(long.imp) <- c(predVar, classVar, "testImportance")
  # Order predictors (rows of the heatmap) by global MeanDecreaseAccuracy, ascending.
  ordered.preds <- names(sort(imp.mat[, "MeanDecreaseAccuracy"]))
  long.imp[[predVar]] <- factor(long.imp[[predVar]], levels = ordered.preds)
  # Tile heatmap: one column per class, one row per predictor.
  ggplot(data = long.imp, aes_string(x = classVar, y = predVar, fill = "testImportance")) +
    geom_tile() +
    scale_fill_gradient2() +
    theme(axis.text.x = element_text(angle = 270, hjust = 1)) +
    ggtitle(title)
}
#############################################
### plot.rf.var.importance.by.class.dotplot
#############################################
# Plot dotplot with variable importance independent by predicted class.
# Args:
# model: random forest model already build
# predVar: string of column ID with predictor/variables names values
# classVar: string of class variable in 'df'
# title: header of the plot
plot.rf.var.importance.by.class.dotplot <- function(model, predVar, classVar, title) {
  # Per-class importance matrix, reshaped to long format for ggplot.
  imp.mat <- importance(model)
  long.imp <- melt(imp.mat[, 1:length(model$classes)])
  colnames(long.imp) <- c(predVar, classVar, "value")
  # Order predictors by global MeanDecreaseAccuracy, ascending.
  ordered.preds <- names(sort(imp.mat[, "MeanDecreaseAccuracy"]))
  long.imp[[predVar]] <- factor(long.imp[[predVar]], levels = ordered.preds)
  long.imp[[classVar]] <- factor(long.imp[[classVar]])
  # Lollipop-style dotplot, faceted per class; colors come from the
  # file-level colsCollection palette.
  ggplot(long.imp, aes_string(x = "value", y = predVar, group = predVar, colour = classVar)) +
    geom_segment(aes_string(yend = predVar), xend = 0, colour = "grey50") +
    geom_point(size = 1) +
    scale_color_manual(values = colsCollection) +
    theme_bw() +
    facet_grid(reformulate(classVar)) +
    theme(panel.grid.major.y = element_blank()) +
    theme(text = element_text(size = 12)) +
    xlab(paste(predVar, " importance (Mean Decrease in Accuracy in RandomForest)", sep = "")) +
    theme(axis.text.x = element_text(angle = 270, hjust = 1)) +
    theme(legend.position = "none")
}
####################################################
### plot.rf.var.importance.by.class.andMean.heatmap
###################################################
# Plot heatmap with variable importance mean over all classes
# Args:
# model: random forest model already build
# predVar: string of column ID with predictor/variables names values
# classVar: string of class variable in 'df'
# title: header of the plot
plot.rf.var.importance.by.class.andMean.heatmap <- function(model, predVar, classVar, title) {
  imp.mat <- importance(model)
  # Keep one extra column beyond the classes: the global MeanDecreaseAccuracy.
  long.imp <- melt(imp.mat[, 1:(length(model$classes) + 1)])
  colnames(long.imp) <- c(predVar, classVar, "testImportance")
  # Order predictors by global MeanDecreaseAccuracy, ascending.
  ordered.preds <- names(sort(imp.mat[, "MeanDecreaseAccuracy"]))
  long.imp[[predVar]] <- factor(long.imp[[predVar]], levels = ordered.preds)
  # Rename the trailing class level (MeanDecreaseAccuracy) to "MEAN".
  lvl <- levels(long.imp[[classVar]])
  levels(long.imp[[classVar]]) <- c(lvl[1:(length(lvl) - 1)], "MEAN")
  long.imp[[classVar]] <- factor(long.imp[[classVar]])
  ggplot(data = long.imp, aes_string(x = classVar, y = predVar, fill = "testImportance")) +
    geom_tile() +
    scale_fill_gradient2() +
    theme(axis.text.x = element_text(angle = 270, hjust = 1)) +
    ggtitle(title)
}
####################################################
### plot.rf.var.importance.by.class.andMean.dotplot
####################################################
# Plot dotplot with variable importance mean over all classes
# Args:
# model: random forest model already build
# predVar: string of column ID with predictor/variables names values
# classVar: string of class variable in 'df'
# colorVector: vector of colors
# nBestFeatures: number of top relevant features to show in the plot.
# classNames: vector with ad-hoc class names.
# Plot dotplot with variable importance per class plus the overall mean.
# Args:
#   model: random forest model already built
#   predVar: string of column ID with predictor/variable name values
#   classVar: string of class variable name
#   colorVector: optional vector of colors (one per class facet, incl. MEAN)
#   nBestFeatures: optional number of top-ranked features to show
#   classNames: optional vector of ad-hoc class names (MEAN appended)
plot.rf.var.importance.by.class.andMean.dotplot <- function(model,predVar,classVar,colorVector=NULL,nBestFeatures=NULL,classNames=NULL){
  imp.mat <- importance(model)
  # Keep one extra column beyond the classes: the global MeanDecreaseAccuracy.
  imp.df <- melt(imp.mat[, 1:(length(model$classes) + 1)])
  colnames(imp.df) <- c(predVar, classVar, 'value')
  # a.- Order predictors by global MeanDecreaseAccuracy, ascending.
  pred.order <- names(sort(imp.mat[, 'MeanDecreaseAccuracy']))
  imp.df[, predVar] <- factor(imp.df[, predVar], levels = pred.order)
  # Rename the trailing class level (MeanDecreaseAccuracy) to 'MEAN',
  # optionally replacing the class labels with caller-supplied names.
  class.names <- levels(imp.df[, classVar])
  if (!is.null(classNames)) {
    levels(imp.df[, classVar]) <- c(classNames, 'MEAN')
  } else {
    levels(imp.df[, classVar]) <- c(class.names[1:(length(class.names) - 1)], 'MEAN')
  }
  imp.df[, classVar] <- factor(imp.df[, classVar])
  # b.- Subset to the top nBestFeatures predictors, if requested.
  # BUG FIX: the filter previously hard-coded the column name `test`
  # (subset=(test %in% ...)); use predVar so the function works for any
  # predictor column name passed by the caller.
  if (!is.null(nBestFeatures)) {
    imp.df <- imp.df[imp.df[[predVar]] %in% tail(pred.order, n = nBestFeatures), ]
  }
  p <- ggplot(imp.df, aes_string(x = 'value', y = predVar, group = predVar, colour = classVar)) +
    geom_segment(aes_string(yend = predVar), xend = 0, colour = "grey50") +
    geom_point(size = 3) +
    theme_bw() +
    facet_grid(reformulate(classVar)) +
    theme(panel.grid.major.y = element_blank()) +
    theme(text = element_text(size = 16)) +
    xlab(paste(predVar, " importance (Mean Decrease in Accuracy)", sep = '')) +
    theme(axis.text.x = element_text(angle = 270, hjust = 1)) +
    theme(legend.position = "none")
  # Apply the caller's color scale only when one was supplied.
  if (!is.null(colorVector)) {
    p + scale_color_manual(values = colorVector)
  } else {
    p
  }
}
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# Train a random forest to predict `collection` from the numeric features.
# Drop the second column (the model ID) so only the class and features remain.
df.rf = subset(df, select = -c(2))
set.seed(123)
# 10-fold cross-validation, keeping all fold predictions for confusionMatrix.
train_control <-
  trainControl(method = "cv",
               number = 10,
               savePredictions = "all")
rf_file <- "../data/rf_model_classCollection.Rdata"
# Training is slow; reuse a cached model when one exists on disk.
if (!file.exists(rf_file)) {
  model <-
    train(
      form = collection ~ .,
      data = df.rf,
      trControl = train_control,
      method = "rf",
      ntree = 1000,
      importance = TRUE,
      localImp = TRUE,
      na.action = na.omit
    )
  # FIX: save to rf_file (previously a duplicated literal path) so the
  # cached location always matches the one checked/loaded above.
  save(model, file = rf_file)
} else {
  load(rf_file)
}
print(model$finalModel)
##
## Call:
## randomForest(x = x, y = y, ntree = 1000, mtry = param$mtry, importance = TRUE, localImp = TRUE)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 65
##
## OOB estimate of error rate: 0.02%
## Confusion matrix:
## agora bigg carveme ebrahim kbase optflux path class.error
## agora 801 0 0 0 0 0 0 0.00000000
## bigg 0 36 0 0 0 0 0 0.00000000
## carveme 0 0 5511 0 0 0 0 0.00000000
## ebrahim 0 0 1 79 0 0 0 0.01250000
## kbase 0 0 0 0 1632 0 0 0.00000000
## optflux 0 0 1 0 0 78 0 0.01265823
## path 0 0 0 0 0 0 2641 0.00000000
# Variable Importance: type = 1 plots the permutation-based
# mean decrease in accuracy per feature.
varImpPlot(model$finalModel,type=1)

# Confusion matrix over the pooled cross-validation predictions.
print(confusionMatrix(model$pred$pred,model$pred$obs))
## Confusion Matrix and Statistics
##
## Reference
## Prediction agora bigg carveme ebrahim kbase optflux path
## agora 2403 0 0 1 0 0 0
## bigg 0 108 0 0 0 0 0
## carveme 0 0 16530 81 0 80 10
## ebrahim 0 0 3 158 0 0 0
## kbase 0 0 0 0 4896 0 0
## optflux 0 0 0 0 0 157 0
## path 0 0 0 0 0 0 7913
##
## Overall Statistics
##
## Accuracy : 0.9946
## 95% CI : (0.9937, 0.9954)
## No Information Rate : 0.5112
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9916
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: agora Class: bigg Class: carveme
## Sensitivity 1.00000 1.00000 0.9998
## Specificity 0.99997 1.00000 0.9892
## Pos Pred Value 0.99958 1.00000 0.9898
## Neg Pred Value 1.00000 1.00000 0.9998
## Prevalence 0.07430 0.00334 0.5112
## Detection Rate 0.07430 0.00334 0.5111
## Detection Prevalence 0.07434 0.00334 0.5164
## Balanced Accuracy 0.99998 1.00000 0.9945
## Class: ebrahim Class: kbase Class: optflux
## Sensitivity 0.658333 1.0000 0.662447
## Specificity 0.999907 1.0000 1.000000
## Pos Pred Value 0.981366 1.0000 1.000000
## Neg Pred Value 0.997452 1.0000 0.997514
## Prevalence 0.007421 0.1514 0.007328
## Detection Rate 0.004886 0.1514 0.004855
## Detection Prevalence 0.004978 0.1514 0.004855
## Balanced Accuracy 0.829120 1.0000 0.831224
## Class: path
## Sensitivity 0.9987
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9996
## Prevalence 0.2450
## Detection Rate 0.2447
## Detection Prevalence 0.2447
## Balanced Accuracy 0.9994
model=model$finalModel
# NOTE(review): after the reassignment above, `model` is the randomForest
# object; caret keeps CV predictions on the train object, so `model$pred`
# here is likely NULL — confirm whether this line is still needed.
pred=model$pred
# Change names classes (MeanDecreaseAccuracy --> Mean)
# NOTE(review): `model$classes` is a character vector, so levels() returns
# NULL and the relabelling below probably has no effect — verify intent.
class.names=levels(model$classes)
levels(model$classes) <-c('AGORA','CarveMe','Path2Models','KBase','BiGG','Ebrahim et al.','OptFlux')
model$classes <- factor(model$classes)
library(reshape2)
# FIX: spell out the argument name (was `colorVec=`, which relied on
# partial argument matching against `colorVector`).
plot.rf.var.importance.by.class.andMean.dotplot(model,'test','collection',colorVector=c(colsCollection,'#60d660'))

# FIX: corrected typo in the plot title ("Decreasy" -> "Decrease").
plot.rf.var.importance.by.class.andMean.heatmap(model,'test','collection','Feature importance (Mean Decrease in Accuracy in Random Forest)')

plot.rf.var.importance.by.class.andMean.dotplot(model,'test','collection',colorVector=c(colsCollection,'#60d660'),nBestFeatures=15,classNames=c('AGORA','CarveMe','Path2Models','KBase','BiGG','Ebrahim et al.','OptFlux'))

Clustering
# PAM clustering: pamk() scans k in 2..25 and keeps the best partition
# by average silhouette width.
fitPamBest <- pamk(df.num,krange=2:25)
save(fitPamBest,file='../data/fitPamBest_k2-25.Rdata')
# Export one cluster assignment per model; rownames carry the
# "<collection>_<model>" identifiers.
write.table(as.matrix(fitPamBest$pamobject$clustering),paste("../data/pam_clusters_k",fitPamBest$nc,".txt",sep=""),quote=FALSE,sep='\t',col.names=NA,row.names=TRUE)
# Average silhouette widths recorded for k = 1..25 (index 1 is the k=1 placeholder):
#[1] 0.0000000 0.6994101 0.5969360 0.6976904 0.5068908 0.4770732 0.4372348
#[8] 0.4513308 0.4319778 0.4467728 0.4305388 0.3842136 0.3688111 0.3674104
#[15] 0.3331504 0.3097679 0.3130108 0.3412444 0.3377852 0.3110066 0.3153869
#[22] 0.3001647 0.2838108 0.2866504 0.2949700
# Refit with k = 2 (best from the scan) and inspect silhouettes.
fit <- pam(df.num,2)
print(summary(silhouette(fit)))
## Silhouette of 10780 units in 2 clusters from pam(x = df.num, k = 2) :
## Cluster sizes and average silhouette widths:
## 9112 1668
## 0.7384107 0.8227154
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04206 0.70796 0.80226 0.75146 0.80990 0.88255
# Also inspect k = 4 for comparison with the k = 2 solution.
fit <- pam(df.num,4)
print(summary(silhouette(fit)))
## Silhouette of 10780 units in 4 clusters from pam(x = df.num, k = 4) :
## Cluster sizes and average silhouette widths:
## 803 1666 5587 2724
## 0.7479980 0.8196234 0.8587196 0.7966747
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.008044 0.816291 0.856216 0.828752 0.882189 0.903315
# Hierarchical clustering (hclust default linkage) on the Euclidean
# distance matrix; scan k = 2..25 and keep the k with the best average
# silhouette width.
distMat <-dist(df.num)
fitH <- hclust(distMat)
SIbest=0
kbest=0
for(k in 2:25){
si=summary(silhouette(cutree(fitH,k=k),distMat))$avg.width
if(si>SIbest){
SIbest=si
kbest=k
}
# Log "k:avg.width" for each candidate k.
print(paste(k,si,sep=':'))
}
## [1] "2:0.751455283570779"
## [1] "3:0.74847998211438"
## [1] "4:0.622720089625695"
## [1] "5:0.683254757159377"
## [1] "6:0.682465212661397"
## [1] "7:0.680032957416679"
## [1] "8:0.679604274071885"
## [1] "9:0.664318000860433"
## [1] "10:0.813294970511758"
## [1] "11:0.820027772126945"
## [1] "12:0.822651166977458"
## [1] "13:0.822427652636859"
## [1] "14:0.822642117309186"
## [1] "15:0.823881931204672"
## [1] "16:0.823959863839715"
## [1] "17:0.824214716658879"
## [1] "18:0.821680179851846"
## [1] "19:0.821810108215891"
## [1] "20:0.821771637422601"
## [1] "21:0.822016412264233"
## [1] "22:0.821903229749889"
## [1] "23:0.821952190236938"
## [1] "24:0.821693270568127"
## [1] "25:0.821722309859649"
# Silhouette of the best hierarchical cut (kbest = 17 per the log above).
si<-silhouette(cutree(fitH,k=kbest),distMat)
summary(si)
## Silhouette of 10780 units in 17 clusters from silhouette.default(x = cutree(fitH, k = kbest), dist = distMat) :
## Cluster sizes and average silhouette widths:
## 801 2 17 3 5 7
## 0.75249347 0.32024415 0.43875582 0.48093039 0.53719431 0.31436406
## 2 5532 59 1631 1 31
## 0.01565035 0.87380033 0.28647517 0.71927797 0.00000000 0.37667441
## 20 15 2646 1 7
## 0.29207112 0.33543354 0.83892294 0.00000000 0.22567109
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.2233 0.8038 0.8625 0.8242 0.8904 0.9120
# Keep a copy with the full "<collection>_<model>" labels before truncating
# the plotted labels down to just the collection prefix.
fitH.labelModels=fitH
fitH$labels=gsub('_.*','',fitH$labels)
library(RColorBrewer)
# "Paired" maxes out at 12 colors (warning below); interpolate up to kbest.
my.palette <- brewer.pal(kbest,"Paired")
## Warning in brewer.pal(kbest, "Paired"): n too large, allowed maximum for palette Paired is 12
## Returning the palette you asked for with that many colors
cols <- colorRampPalette(my.palette)(kbest)
clusK=cutree(fitH,kbest)
# Fan and unrooted phylogram views of the dendrogram, tips colored by cluster.
plot(as.phylo(fitH), type = "fan", cex = 0.6, label.offset = 0.3, no.margin=TRUE, tip.color = cols[clusK])

plot(as.phylo(fitH), type='unrooted', cex=0.5, label.offset=0.5, no.margin=TRUE, tip.color = cols[clusK])

# Export the hierarchical cluster assignments using the full-label copy.
groups <- as.factor(cutree(fitH.labelModels, k = kbest))
write.table(
as.matrix(groups),
paste("../data/hclust_clusters_k", kbest, ".txt", sep = ""),
quote = FALSE,
sep = '\t',
col.names = NA,
row.names = TRUE
)
library(RColorBrewer)
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.10.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following objects are masked from 'package:ape':
##
## ladderize, rotate
## The following object is masked from 'package:stats':
##
## cutree
# One HCL rainbow color per hierarchical cluster.
colsCluster=colorspace::rainbow_hcl(kbest, c = 70, l = 50)
clusK=cutree(fitH,kbest,order_clusters_as_data = FALSE)
# define dendrogram
fitH.dend=as.dendrogram(fitH)
collec=labels(fitH.dend)
# Specify different point types and colors for each leave
# NOTE(review): branches_k_color uses k = 10 while kbest = 17 and
# colsCluster has kbest colors — this mismatch produces the "first k
# elements are used" warning below; confirm k = 10 is intentional.
# Setting labels to "" also triggers the label-recycling warning below.
dend <- fitH.dend %>%
set("leaves_pch", 19) %>% # node point type
set("leaves_cex", 0.4) %>% # node point size
#set("leaves_col", colsCollection[as.factor(fitH$labels)]) %>% #node point color
set("labels", "") %>%
set("branches_k_color", colsCluster, k = 10)
## Warning in `labels<-.dendrogram`(dend, value = value, ...): The lengths
## of the new labels is shorter than the number of leaves in the dendrogram -
## labels are recycled.
## Warning in get_col(col, k): Length of color vector was longer than the
## number of clusters - first k elements are used
plot(dend)
# Add the colored bar
# Create a vector giving a color for each model collection
# Inspired by: https://cran.r-project.org/web/packages/dendextend/vignettes/FAQ.html
# Build one collection label per model by matching collection substrings
# against the rownames; anything unmatched stays "Other".
# Inspired by: https://cran.r-project.org/web/packages/dendextend/vignettes/FAQ.html
collect_type <- rep("Other", length(rownames(df.num)))
for (collection.name in c("agora", "bigg", "ebrahim", "embl", "path", "seed", "uminho")) {
  is_x <- grepl(collection.name, rownames(df.num))
  collect_type[is_x] <- collection.name
}
collect_type <- factor(collect_type)
n_collect_types <- length(unique(collect_type))
# Map each collection level to its palette color and draw the annotation
# bar under the dendrogram.
col_collect_type <- colsCollection[collect_type]
colored_bars(col_collect_type, dend, rowLabels = "Collection")

library(ggplot2)
# Convert the dendextend dendrogram to ggplot form.
ggd1 <- as.ggdend(dend)
# Create a radial plot and remove labels
ggplot(ggd1, labels = FALSE) +
scale_y_reverse(expand = c(0.2, 0)) +
coord_polar(theta = "x")
## Warning: Removed 10779 rows containing missing values (geom_point).

Discriminant factor of clusters
# Train a random forest to predict the hierarchical cluster label from the
# numeric features, reusing the train_control defined earlier.
df.rf = df.num
df.rf$cluster = as.factor(cutree(fitH, kbest))
set.seed(123)
rf_file = "../data/rf_model_classCluster.Rdata"
# Training is slow; reuse a cached model when one exists on disk.
if (!file.exists(rf_file)) {
model.cl <-
train(
form = cluster ~ .,
data = df.rf,
trControl = train_control,
method = "rf",
ntree = 1000,
importance = TRUE,
localImp = TRUE,
na.action = na.omit,
# Print progress every 10 trees.
do.trace = 10
)
save(model.cl, file = rf_file)
} else {
load(rf_file)
}
print(model.cl$finalModel)
##
## Call:
## randomForest(x = x, y = y, ntree = 1000, mtry = param$mtry, importance = TRUE, localImp = TRUE, do.trace = 10)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 65
##
## OOB estimate of error rate: 0.12%
## Confusion matrix:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 class.error
## 1 801 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0000000000
## 2 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1.0000000000
## 3 0 0 17 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0000000000
## 4 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0000000000
## 5 0 0 1 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0.2000000000
## 6 0 1 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0.1428571429
## 7 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1.0000000000
## 8 0 0 0 0 0 0 0 5531 1 0 0 0 0 0 0 0 0 0.0001807664
## 9 0 0 0 0 0 0 0 0 59 0 0 0 0 0 0 0 0 0.0000000000
## 10 0 0 0 0 0 0 0 0 0 1631 0 0 0 0 0 0 0 0.0000000000
## 11 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1.0000000000
## 12 0 0 0 0 0 0 0 0 0 0 0 30 0 1 0 0 0 0.0322580645
## 13 0 0 0 0 0 0 0 0 0 0 0 0 20 0 0 0 0 0.0000000000
## 14 0 0 0 0 0 0 0 0 0 0 0 0 0 14 1 0 0 0.0666666667
## 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2646 0 0 0.0000000000
## 16 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1.0000000000
## 17 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 5 0.2857142857
# Variable Importance: type = 1 plots the permutation-based
# mean decrease in accuracy per feature.
varImpPlot(model.cl$finalModel,type=1)

# Confusion matrix over the pooled cross-validation predictions.
print(confusionMatrix(model.cl$pred$pred,model.cl$pred$obs))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10
## 1 1923 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 2 0 0 0 0
## 3 0 2 39 3 2 6 3 0 0 0
## 4 0 0 0 3 0 0 0 0 0 0
## 5 0 0 0 0 10 0 0 0 0 0
## 6 0 2 0 0 0 7 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0 0
## 8 0 1 0 0 0 0 0 13273 47 0
## 9 0 0 0 0 0 0 0 2 94 0
## 10 0 0 0 0 0 0 0 0 0 3915
## 11 0 0 0 0 0 0 0 0 0 0
## 12 0 0 0 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0 0 0 0
## 14 0 0 0 0 0 0 0 0 0 0
## 15 0 1 0 0 0 0 0 0 0 0
## 16 0 0 0 0 0 0 0 0 0 0
## 17 0 0 0 0 0 0 0 0 0 0
## Reference
## Prediction 11 12 13 14 15 16 17
## 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0
## 8 0 25 16 13 9 0 6
## 9 0 0 0 0 0 0 0
## 10 0 0 0 0 0 0 0
## 11 0 0 0 0 0 0 0
## 12 0 49 0 1 0 0 0
## 13 0 0 32 0 0 0 0
## 14 0 0 0 25 0 0 0
## 15 0 1 0 0 6342 0 0
## 16 0 0 0 0 0 0 0
## 17 0 0 0 0 0 0 12
##
## Overall Statistics
##
## Accuracy : 0.9945
## 95% CI : (0.9935, 0.9954)
## No Information Rate : 0.5132
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9915
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 1.00000 0.000e+00 1.000000 0.500000 0.8333333
## Specificity 1.00000 9.999e-01 0.999380 1.000000 1.0000000
## Pos Pred Value 1.00000 0.000e+00 0.709091 1.000000 1.0000000
## Neg Pred Value 1.00000 9.998e-01 1.000000 0.999884 0.9999226
## Prevalence 0.07434 2.320e-04 0.001508 0.000232 0.0004639
## Detection Rate 0.07434 0.000e+00 0.001508 0.000116 0.0003866
## Detection Prevalence 0.07434 7.732e-05 0.002126 0.000116 0.0003866
## Balanced Accuracy 1.00000 5.000e-01 0.999690 0.750000 0.9166667
## Class: 6 Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity 0.4666667 0.000000 0.9998 0.666667 1.0000
## Specificity 0.9999226 1.000000 0.9907 0.999922 1.0000
## Pos Pred Value 0.7777778 NaN 0.9913 0.979167 1.0000
## Neg Pred Value 0.9996906 0.999884 0.9998 0.998176 1.0000
## Prevalence 0.0005799 0.000116 0.5132 0.005451 0.1514
## Detection Rate 0.0002706 0.000000 0.5131 0.003634 0.1514
## Detection Prevalence 0.0003479 0.000000 0.5177 0.003711 0.1514
## Balanced Accuracy 0.7332947 0.500000 0.9953 0.833294 1.0000
## Class: 11 Class: 12 Class: 13 Class: 14 Class: 15
## Sensitivity NA 0.653333 0.666667 0.6410256 0.9986
## Specificity 1 0.999961 1.000000 1.0000000 0.9999
## Pos Pred Value NA 0.980000 1.000000 1.0000000 0.9997
## Neg Pred Value NA 0.998993 0.999381 0.9994582 0.9995
## Prevalence 0 0.002900 0.001856 0.0015078 0.2455
## Detection Rate 0 0.001894 0.001237 0.0009665 0.2452
## Detection Prevalence 0 0.001933 0.001237 0.0009665 0.2453
## Balanced Accuracy NA 0.826647 0.833333 0.8205128 0.9992
## Class: 16 Class: 17
## Sensitivity NA 0.6666667
## Specificity 1 1.0000000
## Pos Pred Value NA 1.0000000
## Neg Pred Value NA 0.9997679
## Prevalence 0 0.0006959
## Detection Rate 0 0.0004639
## Detection Prevalence 0 0.0004639
## Balanced Accuracy NA 0.8333333
model=model.cl$finalModel
# CV predictions live on the caret train object (model.cl), not finalModel.
pred=model.cl$pred
library(reshape2)
# One color per facet: presumably 17 clusters + the "MEAN" facet = 18
# entries (first and last are intentionally the same hue) — TODO confirm.
colVector=c(
"#DB9D85",
"#E2979B",
"#E494B2",
"#DF94C6",
"#D297D5",
"#BD9EDF",
"#A2A7E2",
"#80B0DE",
"#5CB7D3",
"#3EBCC3",
"#3ABEAF",
"#52BE99",
"#70BB84",
"#8DB771",
"#A7B166",
"#BCAB66",
"#CEA472",
"#DB9D85")
# agora, bigg, bigg, bigg, ebrahim+path+uminho, ~embl, embl+path, path, seed, seed
# FIX: name the argument explicitly for clarity/consistency with earlier calls.
plot.rf.var.importance.by.class.andMean.dotplot(model,'test','cluster',colorVector=colVector)

# FIX: corrected typo in the plot title ("Decreasy" -> "Decrease").
plot.rf.var.importance.by.class.andMean.heatmap(model,'test','cluster','Feature importance (Mean Decrease in Accuracy in Random Forest)')
